Packages
Click to expand
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'igraph'
##
## The following objects are masked from 'package:lubridate':
##
## %--%, union
##
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
##
## The following objects are masked from 'package:purrr':
##
## compose, simplify
##
## The following object is masked from 'package:tidyr':
##
## crossing
##
## The following object is masked from 'package:tibble':
##
## as_data_frame
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
Functions
Click to expand
# load data functions
read_snp_csv <- function(distance_matrix_csv_path){
tryCatch({
snp_data <- read_csv(distance_matrix_csv_path, show_col_types = F)
}, error = function(e) {
stop("Error reading the SNP data file. Please check that it is a valid CSV file.")
})
# check valid matrix
if(nrow(snp_data) + 1 != ncol(snp_data)){
stop('Number of rows and cols in snp data must be the same')
}
if(!'Name' %in% names(snp_data)){
stop('Name column not in snp_data, is it from Pathogenwatch?')
}
return(
snp_data %>%
pivot_longer(cols = !Name, values_to = 'dist', names_to = 'iso2') %>%
rename(iso1=Name)
)
}
read_kleborate_data_csv <- function(kleborate_data_path){
tryCatch({
kleborate_data <- read_csv(kleborate_data_path, show_col_types = F)
}, error = function(e) {
stop("Error reading the kleborate data file. Please check that it is a valid CSV file.")
})
# TODO: validate kleborate data columns
return(kleborate_data)
}
read_metadata_csv <- function(metadata_path){
tryCatch({
metadata <- read_csv(metadata_path, show_col_types = F)
}, error = function(e) {
stop("Error reading the metadata file. Please check that it is a valid CSV file.")
})
if (! all(REQUIRED_METADATA_COLS %in% names(metadata)) ) {
stop(paste0("The following columns are required in the metadata file: ",
paste(REQUIRED_METADATA_COLS, collapse = ", ")))
}
return(metadata)
}
format_sample_dates <- function(metadata){
if(! all(c('id', 'Year', 'Month', 'Day') %in% names(metadata)) ) {
stop(paste0("'id', 'Year', 'Month', and 'Day' columns are required."))
}
sample_dates <- metadata %>% dplyr::select(id, Year, Month, Day) %>%
dplyr::filter(!(is.na(Year) | is.na(Month) | is.na(Day))) %>%
dplyr::rowwise() %>%
dplyr::mutate(formatted_date = paste0(Year, "-", Month, "-", Day)) %>%
dplyr::mutate(formatted_date = lubridate::as_date(formatted_date)) %>%
# dplyr::mutate(decimal_date = lubridate::decimal_date(formatted_date)) %>%
dplyr::select(id, formatted_date) %>%
dplyr::ungroup()
return(sample_dates)
}
# cluster functions
get_snp_and_epi_data <- function(snp_data, sample_dates, metadata,
geo_column = "Country"){
if (length(geo_column) > 1){stop("Can only supply one geo_column")}
if(!geo_column %in% names(metadata)){
stop(glue::glue("The specified '{geo_column}' column is missing from metadata"))
}
geo_data <- metadata %>% select(id, all_of(geo_column))
colnames(geo_data) <- c("id", "geo")
snp_and_epi_data <- snp_data %>%
# days between isolation
dplyr::left_join(sample_dates, by = c('iso1' = 'id')) %>% dplyr::rename('iso1_date' = 'formatted_date') %>%
dplyr::left_join(sample_dates, by = c('iso2' = 'id')) %>% dplyr::rename('iso2_date' = 'formatted_date') %>%
dplyr::mutate(days = abs(floor(as.numeric(
difftime(iso1_date, iso2_date, units = "days")
)))) %>%
dplyr::select(-c(iso1_date, iso2_date)) %>%
# check isolates with same or different location
dplyr::left_join(geo_data, by = c('iso1' = 'id')) %>% dplyr::rename('geo1' = 'geo') %>%
dplyr::left_join(geo_data, by = c('iso2' = 'id')) %>% dplyr::rename('geo2' = 'geo') %>%
dplyr::mutate(pair_location = if_else(geo1 == geo2, "Same", "Different")) %>%
dplyr::select(-c(geo1, geo2))
return(snp_and_epi_data)
}
#' Create a graph from table with distances and a geographic column
#' @param distance_data A tibble with columns: 'iso1', 'iso2',
#' and one or more columns containing distances (e.g., SNPs)
#' between the isolates in the 'iso1' and 'iso2' columns
#' @param dist_columns Vector of names of each of the columns containing distances
#' e.g., dist_columns = c('SNPs', 'days')
#' @param dist_thresholds Vector of distance thresholds [int] to be applied for each distance column
#' Length of this vector must be equal to the length of dist_columns
#' e.g., dist_threshold = c(10, 14)
#' @param pair_location_column Name of the geographic column. Column must contain either of the
#' values "Same" or "Different" representing isolate pairs from the same or different locations
#' @export
get_cluster_graph <- function(snp_and_epi_data, dist_columns, dist_thresholds,
pair_location_column = "pair_location", directed=FALSE) {
if (length(dist_columns) != length(dist_thresholds)) {
stop("The lengths of vars `dist_columns` and `dist_thresholds` must be equal")
}
if(!all(dist_columns %in% names(snp_and_epi_data))){
stop('One or more specified columns missing from snp_and_epi_data')
}
if(!pair_location_column %in% names(snp_and_epi_data)){
stop(glue::glue("The specified '{pair_location_column}' column is missing from snp_and_epi_data"))
}
if(!tibble::is_tibble(snp_and_epi_data)){stop('snp_and_epi_data is not a tibble')}
# filter out pairs based on the threshold for each distance column
for (i in seq(1:length(dist_columns))) {
snp_and_epi_data %<>%
dplyr::filter(!!rlang::sym(dist_columns[i]) <= dist_thresholds[i]) %>%
dplyr::select(-c(!!rlang::sym(dist_columns[i])))
}
# filter out pairs with same isolate
# filter out pairs belonging to the same geo location
# generate graph
snp_and_epi_data %>%
dplyr::filter(! iso1 == iso2) %>%
dplyr::filter(!!rlang::sym(pair_location_column) == "Same") %>%
dplyr::select(c(iso1, iso2)) %>%
as.matrix() %>%
igraph::graph_from_edgelist(., directed = directed)
}
# adapted from KleborateR function; removed dependency on kleborate data
get_cluster_membership_from_graph <- function(cluster_graph) {
if(!igraph::is_igraph(cluster_graph)){stop('cluster_graph is not an igraph')}
igraph::components(cluster_graph)$membership %>%
tibble::as_tibble(rownames = "id") %>%
dplyr::mutate(Cluster = as.character(value)) %>%
dplyr::select(!value)
}
calculate_cluster_proportion <- function(clusters_data){
clusters_data %>%
dplyr::summarise(
cluster_proportion = round(sum(!is.na(Cluster)) / n(), 2)
) %>% dplyr::pull(cluster_proportion)
}
# plotting functions
plot_dist_distribution <- function(distance_data, dist_column, x_label = "Pairwise distance",
plot_title = "Distribution of pairwise distances",
scale_y = F, facet_plot = F, facet_column = NULL){
if(! all(c('iso1', 'iso2') %in% colnames(distance_data)) ) {
stop("'iso1' and 'iso2' columns required")
}
if(! dist_column %in% colnames(distance_data) ) {
stop(glue::glue("'{dist_column}' column missing from distance_data"))
}
if(facet_plot){
if(is.null(facet_column)){
stop("'facet_plot' set to TRUE but facet column not provided")
}
if(! facet_column %in% colnames(distance_data) ) {
stop(glue::glue("'{facet_column}' column missing from distance_data"))
}
}
# plot
dist_plot <- ggplot(distance_data, aes(x = !!rlang::sym(dist_column))) +
ggplot2::geom_histogram(binwidth = 10, color = "black", alpha = 0.8) +
ggplot2::labs(title = plot_title, x = x_label, y = "Number of isolate pairs") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text = element_text(size = 14)) +
ggplot2::theme(axis.title = element_text(size = 14)) +
ggplot2::theme(legend.text = element_text(size = 14))
if (scale_y){
dist_plot <- dist_plot + scale_y_log10(labels = scales::comma_format())
}
if (facet_plot){
dist_plot <- dist_plot +
facet_wrap(~pair_location, nrow=2, scales="free_y")
}
return(dist_plot)
}
# Adapted from KleborateR
plot_transmission_network <- function(snp_graph, kleborate_data, var1) {
snp_graph %>%
dplyr::left_join(
kleborate_data %>% dplyr::select(all_of(c('Genome Name', var1))),
by=c('name'='Genome Name')) %>%
ggplot2::ggplot(aes(x=x, y=y, xend=xend, yend=yend, label=name)) +
ggnetwork::geom_edges() +
ggnetwork::geom_nodes(
aes(col=.data[[var1]]),
size=3.5, shape=16, alpha=.5
) +
ggnetwork::theme_blank() +
ggplot2::guides(col="none", fill="none")
}
# TO DO. Add options for customising plot
plot_clusters <- function(clusters_data, min_cluster_size = 3) {
# filter
clusters_data %<>%
dplyr::filter(!is.na(Cluster)) %>%
dplyr::add_count(Cluster, name = "cluster_size") %>%
dplyr::filter(cluster_size >= min_cluster_size)
# Calculate point sizes based on the number of isolates on each date per cluster
size_scale_factor <- clusters_data %>%
dplyr::group_by(Cluster, formatted_date) %>%
dplyr::reframe(size_scale = n())
# merge
clusters_data %<>% dplyr::left_join(size_scale_factor, by = c("Cluster", "formatted_date"))
# plot
ggpubr::ggscatter(
clusters_data, x = "formatted_date", y = "ST",
color = "Cluster", palette = "viridis", ellipse = TRUE, ellipse.type = "convex",
# shape = "Country",
size = clusters_data$size_scale, # Use the size scaling factor
legend = "right", ggtheme = theme_bw(),
ylab = "ST",
xlab = "Isolation date"
) + ggplot2::scale_x_date(labels = scales::date_format("%Y-%m"), date_breaks = "4 weeks")
}Set up
# Data
DEMO_SNP_DATA <- "data/demo_data/euscape/pw-euscape-difference-matrix.csv"
DEMO_METADATA <- "data/demo_data/euscape/pw-euscape-metadata.csv"
DEMO_KLEBORATE_DATA <- "data/demo_data/euscape/pw-euscape-kleborate.csv"
# Thresholds
snp_distance_threshold <- 10
temporal_distance_threshold <- 14 # days
# Specs ---------------------------------------------------
REQUIRED_METADATA_COLS <- c('id', 'Year', 'Month', 'Day', 'Country') Load data
Get clusters
# get pairwise snp distance, days between isolates, and shared location
snp_and_epi_data <- get_snp_and_epi_data(snp_data, sample_dates, metadata,
geo_column = "Country")
# get cluster graph based on snp distance, days between isolates, and shared location
epi_snp_graph <- get_cluster_graph(snp_and_epi_data,
dist_column = c('dist', 'days'),
pair_location_column = "pair_location",
dist_threshold = c(snp_distance_threshold, temporal_distance_threshold))
epi_snp_clusters <- get_cluster_membership_from_graph(epi_snp_graph) %>%
dplyr::right_join(metadata, by = 'id') %>%
dplyr::left_join(sample_dates, by = 'id') %>%
dplyr::left_join(kleborate_data, by = c('id' = 'Genome Name'))Cluster proportion
Proportion of cases part of a transmission cluster = 0.31
Plots
# plot snp distance distribution
plot_dist_distribution(snp_and_epi_data, dist_column = "dist", x_label = "Pairwise distance (SNPs)")# plot temporal distance distribution
plot_dist_distribution(snp_and_epi_data, dist_column = "days", x_label = "Pairwise temporal distance (days)")# plot clusters
min_cluster_size <- 5
plot_clusters(epi_snp_clusters, min_cluster_size = min_cluster_size)## Warning in format_fortify(model = model, nodes = nodes, weights = "none", :
## duplicated edges detected